# Working with Data
import pandas as pd
import numpy as np
# Data Viz
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import cufflinks as cf
# Plotting on Jupyter
cf.go_offline()
%matplotlib inline
# Load the training set from a path relative to this notebook and preview it.
DATA_PATH = '../data/diamonds_train.csv'
df = pd.read_csv(DATA_PATH)
display(df.head())

# Report the table dimensions: rows, columns and total number of cells.
n_rows, n_columns = df.shape
print('\nShape and Size:')
print(f'Rows: {n_rows}\nCols: {n_columns}\nCels: {df.size}\n')
# Split the columns into numerical and categorical groups.
# 'number' is the documented selector for all numeric dtypes, so no numeric
# column is left out because of its bit width or signedness; the categorical
# group is its exact complement, in the original column order.
num_cols = df.select_dtypes(include='number').columns.to_list()
cat_cols = df.select_dtypes(exclude='number').columns.to_list()
print(f'{len(num_cols)} Numerical Categories: {num_cols}')
print(f'{len(cat_cols)} Categorical Categories: {cat_cols}')
# Bar chart of the number of null cells per column. The y-axis is fixed to the
# full row count so each bar reads relative to the size of the dataset.
missing_per_column = df.isnull().sum()
ax = missing_per_column.plot(kind='bar', ylim=(0, len(df)))
ax.set(xlabel='columns', ylabel='number of mising values');
# Collect every row that repeats an earlier one (the first occurrence is kept).
is_repeat = df.duplicated(keep='first')
duplicated_df = df.loc[is_repeat]
print(f'There is {len(duplicated_df)} rows duplicated in this dataset')
duplicated_df.head()

# Remove the repeated rows in place; labels that are already gone are ignored.
df.drop(index=duplicated_df.index, inplace=True, errors='ignore')

# Derived feature: price divided by carat weight.
df['price_by_carat'] = df['price'].div(df['carat'])
df.head()

# Summary statistics of the numerical columns, one row per column.
df[num_cols].describe().transpose()
❗️ Something to check: it looks like there are some rows with a value of 0 mm in one or more of their dimensions.
# Boolean mask: True for rows where at least one of x, y or z is exactly 0.
filt_cero_dimension = df[['x', 'y', 'z']].eq(0).any(axis=1)
zero_dimension_rows = df[filt_cero_dimension]

text = (f'There is {len(zero_dimension_rows)} rows with 0mm as value in '
        'one or more of its dimensions.\n')
print(text)
display(zero_dimension_rows.head())

# Remove the flagged rows from the working frame in place.
df.drop(zero_dimension_rows.index, inplace=True)
Quantitative summary after removing the rows with zero dimensions:
# Summary statistics: numerical columns first, then the categorical ones.
df.describe().T
df[cat_cols].describe().T

# Correlation heatmap. The correlation is computed on the numeric columns only:
# the frame also holds string columns, and recent pandas versions raise on
# them instead of silently skipping them.
f, ax = plt.subplots(figsize=(19, 6))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=6, center=0, ax=ax);
# Percent-normalised histograms of the categorical columns, one subplot each.
cat_hist_options = dict(
    kind='hist',
    histnorm='percent',
    xTitle='Value',
    yTitle='Percent',
    title='Categorical Values Distribution (%)',
    subplots=True,
)
cat_fig = df[cat_cols].iplot(**cat_hist_options)
Diamonds with different proportions and good polish make better use of the light, and will be bright, colorful, and scintillating.
A beautiful diamond looks the way it does because of three optical effects: white light reflections called brightness, flashes of color called fire, and areas of light and dark called scintillation.
Pattern is the relative size, arrangement, and contrast of bright and dark areas that result from a diamond’s internal and external reflections. There must be enough contrast between the bright and dark areas to give the pattern a crisp, sharp look.
# Side by side: number of diamonds per cut, and price spread per cut.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(17, 4))
count_axis, box_axis = ax
sns.countplot(data=df, x='cut', ax=count_axis)
sns.boxplot(data=df, x="cut", y="price", ax=box_axis);

# Price histogram with one facet (and one colour) per cut.
fig = px.histogram(
    df,
    x="price",
    color='cut',
    facet_col="cut",
    title='Price Distribution by Cut',
)
fig.show()

# Carat against price, coloured by cut. The columns are passed as Series, so
# the axis and legend names are supplied through `labels`.
cut_scatter_labels = {"x": "Price", "y": "Carat", 'color': 'Cut Type'}
px.scatter(
    x=df["price"],
    y=df["carat"],
    color=df['cut'],
    title='Cut Distribution by Price and Carat',
    labels=cut_scatter_labels,
)

Among other things, blemishes include scratches and nicks on a diamond’s surface. Inclusions are generally on the inside, and some might break the surface of the stone. Sometimes, tiny diamond or other mineral crystals are trapped inside a diamond when it forms. Depending on where they’re located, they might remain after the stone has been cut and polished, and they can affect a diamond’s appearance.
Clarity characteristics might have a negative influence on a diamond’s value, but they can have positive effects as well. For one thing, they help gemologists separate diamond from imitations.
There are 11 clarity grades in the GIA clarity grading system. They are Flawless, Internally Flawless, two categories of Very, Very Slightly Included, two categories of Slightly Included, and three categories of Included. The effect of a clarity characteristic on the clarity grade is based on its size, number, position, nature, and color or relief.
🙋🏻♂️: To make this clearer for readers who are not diamond experts, let's rename the categories with a numeric prefix so that they sort from the most flawless grade to the least flawless one.
# Prefix each clarity grade with a two-digit rank so that a plain alphabetical
# sort follows the order in which the grades are listed here.
clarity_grades = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']
new_names = {
    grade: f'{rank:02d}_{grade}'
    for rank, grade in enumerate(clarity_grades, start=1)
}
df.replace({'clarity': new_names}, inplace=True)
clartity_order_list = sorted(df['clarity'].unique())

# Side by side: number of diamonds per clarity grade, and price spread per grade.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
count_axis, box_axis = ax
sns.countplot(data=df, x='clarity', order=clartity_order_list, ax=count_axis)
sns.boxplot(data=df, x="clarity", y="price", order=clartity_order_list, ax=box_axis)
plt.tight_layout()

# Price histogram with one facet (and one colour) per clarity grade.
fig = px.histogram(
    df,
    x="price",
    color='clarity',
    facet_col="clarity",
    title='Price Distribution by Clarity',
    template="plotly_white",
)
fig.show()

# Carat against price, coloured by clarity grade.
px.scatter(
    x=df["price"],
    y=df["carat"],
    color=df['clarity'],
    title='Clarity Distribution by Price and Carat',
    labels={"x": "Price", "y": "Carat", 'color': 'Clarity Level'},
    template="plotly_white",
)

# Carat against price per carat, coloured by clarity grade.
px.scatter(
    x=df["price_by_carat"],
    y=df["carat"],
    color=df['clarity'],
    title='Clarity Distribution by Price for Carat and Carat',
    labels={"x": "Price by Carat", "y": "Carat", 'color': 'Clarity Level'},
    template="plotly_white",
)
Subtle differences in color can dramatically affect diamond value. Two diamonds of the same clarity, weight, and cut can differ in value based on color alone. Even the slightest hint of color can make a dramatic difference in value.
Diamonds that range from colorless to light yellow and brown fall within the normal color range. Within that range, colorless diamonds are the most rare, so they’re the most valuable. They set the standard for grading and pricing other diamonds in the normal color range.
A chemically pure and structurally perfect diamond has no hue, like a drop of pure water, and consequently, a higher value. GIA's D-to-Z color-grading system measures the degree of colorlessness by comparing a stone under controlled lighting and precise viewing conditions to masterstones of established color value.
# Number of rows per color grade (non-null 'x' values are counted per group).
count_df = df[['color', 'x']].groupby(['color']).count().reset_index()

# Left: bar chart of the counts. Right: price box plot per color grade.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Diamond by Color', 'Color Statistics'))
fig.add_trace(go.Bar(x=count_df['color'], y=count_df['x'], name='Count Graph'), row=1, col=1)
fig.add_trace(go.Box(x=df['color'], y=df['price'], name='Statistics Graph'), row=1, col=2)
# Show the figure explicitly: `fig` is rebound just below, so without this call
# the subplot figure is only rendered if it happens to end a notebook cell.
fig.show()

# Facet order shared by the two histograms below.
color_order = {"color": ["D", "E", "F", "G", 'H', 'I', 'J']}

# Price histogram with one facet (and one colour) per color grade.
fig = px.histogram(df, x="price", facet_col="color", category_orders=color_order,
                   color='color', title='Price Distribution by Color', template="plotly_white")
fig.show()

# Price-per-carat histogram faceted by color grade. The title names the
# faceting variable (color); it previously said "Clarity".
fig = px.histogram(df, x="price_by_carat", facet_col="color", category_orders=color_order,
                   title='Price by Carat Distribution for Color', template="plotly_white")
fig.show()

# Carat against price, coloured by color grade.
px.scatter(y=df["carat"], x=df["price"], color=df['color'], title='Color Distribution by Price and Carat',
           labels={
               "x": "Price",
               "y": "Carat",
           }, template="plotly_white")
# Numerical columns other than the three physical dimensions.
dimensions = ['x', 'y', 'z']
no_dimensions = [col for col in num_cols if col not in dimensions]

# Histograms of those columns, one subplot each. On a histogram the x-axis
# carries the variable's values and the y-axis the counts, so the axis titles
# are assigned accordingly (they were previously swapped).
num_fig = df[no_dimensions].iplot(kind='hist',
                                  xTitle='Values',
                                  yTitle='Counts',
                                  title='Numerical Values Distribution',
                                  subplots=True)
A metric “carat” is defined as 200 milligrams. Each carat is subdivided into 100 ‘points.’ This allows very precise measurements to the hundredth decimal place. A jeweler may describe the weight of a diamond below one carat by its ‘points’ alone. For instance, the jeweler may refer to a diamond that weighs 0.25 carats as a ‘twenty-five pointer.’ Diamond weights greater than one carat are expressed in carats and decimals. A 1.08 carat stone would be described as ‘one point oh eight carats.’
Some weights are considered “magic sizes” – half carat, three-quarter carat, and carat. Visually, there’s little difference between a 0.99 carat diamond and one that weighs a full carat. But the price differences between the two can be significant.
# Carat against price with an OLS trendline drawn in red.
px.scatter(
    df,
    x="price",
    y="carat",
    opacity=0.5,
    trendline="ols",
    trendline_color_override='red',
    title='Price/Carat (linear Scale)',
    template="plotly_white",
)

# Distribution of carat weight.
px.histogram(df, template="plotly_white", x="carat")

# Distribution of price per carat.
px.histogram(df, template="plotly_white", x="price_by_carat")
# Depth: against price, and against price per carat, coloured by color grade.
a = px.scatter(df, y="depth", x="price", title='Distribution of Price by Depth and Color',
               color='color', template="plotly_white")
b = px.scatter(df, x="depth", y="price_by_carat", title='Distribution of Depth by Price by Carat and Color',
               color='color', template="plotly_white")
display(a, b)

# Table: the same pair of plots. The second title now names "Table", the
# variable actually on the x-axis (it was copied over from the depth plot).
a = px.scatter(df, y="table", x="price", title='Distribution of Price by Table and Color',
               color='color', template="plotly_white")
b = px.scatter(df, x="table", y="price_by_carat", title='Distribution of Table by Price by Carat and Color',
               color='color', template="plotly_white")
display(a, b)
# Overall distribution of price.
px.histogram(df, template="plotly_white", title='Distribution of Price', x="price")

# Box plot of the price column.
px.box(df, template="plotly_white", title='General Statistics of Prce Column', x="price")

# Mean price for every (cut, color) pair, drawn as a 3D scatter in which both
# marker size and height encode the mean price.
price_by_cut_color = df.groupby(["cut", "color"])["price"]
mean_prices = price_by_cut_color.mean().reset_index()
fig = px.scatter_3d(
    mean_prices,
    x='cut',
    y='color',
    z='price',
    color='color',
    size='price',
    hover_name='price',
)
fig.show()
# One horizontal box plot per physical dimension, laid out in a single row.
fig = make_subplots(rows=1, cols=3, subplot_titles=('Length', 'Width', 'Height'))
dimension_labels = (('x', 'Length'), ('y', 'Width'), ('z', 'Height'))
for position, (column, label) in enumerate(dimension_labels, start=1):
    fig.add_trace(go.Box(x=df[column], name=label), row=1, col=position)
fig.show()

# Histograms of the dimension columns, one subplot each.
size_hist_options = dict(
    kind='hist',
    xTitle='Dimension',
    yTitle='Count',
    title='Size Values Distribution',
    subplots=True,
)
num_fig = df[dimensions].iplot(**size_hist_options)

# 3D scatter of the three dimensions; marker colour follows the color grade
# and marker size follows the price.
fig = px.scatter_3d(
    df,
    x='x',
    y='y',
    z='z',
    color='color',
    size='price',
    hover_name='price',
)
fig.show()
Exporting the data to a CSV file to build an exploratory dashboard in Tableau.
# Write the wrangled frame to disk; the row index is not written to the file.
CLEAN_DATA_PATH = '../data/clean/diamonds_wrangled.csv'
df.to_csv(CLEAN_DATA_PATH, index=False)